{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "“austen-emma.txt”的文本长度为887071, 词汇数量为192427, 句子数量为7752\n",
      "“austen-persuasion.txt”的文本长度为466292, 词汇数量为98171, 句子数量为3747\n",
      "“austen-sense.txt”的文本长度为673022, 词汇数量为141576, 句子数量为4999\n",
      "“bible-kjv.txt”的文本长度为4332554, 词汇数量为1010654, 句子数量为30103\n",
      "“blake-poems.txt”的文本长度为38153, 词汇数量为8354, 句子数量为438\n",
      "“bryant-stories.txt”的文本长度为249439, 词汇数量为55563, 句子数量为2863\n",
      "“burgess-busterbrown.txt”的文本长度为84663, 词汇数量为18963, 句子数量为1054\n",
      "“carroll-alice.txt”的文本长度为144395, 词汇数量为34110, 句子数量为1703\n",
      "“chesterton-ball.txt”的文本长度为457450, 词汇数量为96996, 句子数量为4779\n",
      "“chesterton-brown.txt”的文本长度为406629, 词汇数量为86063, 句子数量为3806\n",
      "“chesterton-thursday.txt”的文本长度为320525, 词汇数量为69213, 句子数量为3742\n",
      "“edgeworth-parents.txt”的文本长度为935158, 词汇数量为210663, 句子数量为10230\n",
      "“melville-moby_dick.txt”的文本长度为1242990, 词汇数量为260819, 句子数量为10059\n",
      "“milton-paradise.txt”的文本长度为468220, 词汇数量为96825, 句子数量为1851\n",
      "“shakespeare-caesar.txt”的文本长度为112310, 词汇数量为25833, 句子数量为2163\n",
      "“shakespeare-hamlet.txt”的文本长度为162881, 词汇数量为37360, 句子数量为3106\n",
      "“shakespeare-macbeth.txt”的文本长度为100351, 词汇数量为23140, 句子数量为1907\n",
      "“whitman-leaves.txt”的文本长度为711215, 词汇数量为154883, 句子数量为4250\n"
     ]
    }
   ],
   "source": [
    "from nltk.corpus import gutenberg\n",
    "for fileid in gutenberg.fileids():\n",
    "    raw = gutenberg.raw(fileid)\n",
    "    num_length = len(raw)\n",
    "    words = gutenberg.words(fileid)\n",
    "    num_words = len(words)\n",
    "    sents = gutenberg.sents(fileid)\n",
    "    num_sents = len(sents)\n",
    "    print(\"“%s”的文本长度为%d, 词汇数量为%d, 句子数量为%d\"%(fileid,num_length, num_words, num_sents))\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "布朗语料库的类别:\n",
      "['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']\n",
      "布朗语料库“news”类别中的文件:\n",
      "['ca01', 'ca02', 'ca03', 'ca04', 'ca05', 'ca06', 'ca07', 'ca08', 'ca09', 'ca10', 'ca11', 'ca12', 'ca13', 'ca14', 'ca15', 'ca16', 'ca17', 'ca18', 'ca19', 'ca20', 'ca21', 'ca22', 'ca23', 'ca24', 'ca25', 'ca26', 'ca27', 'ca28', 'ca29', 'ca30', 'ca31', 'ca32', 'ca33', 'ca34', 'ca35', 'ca36', 'ca37', 'ca38', 'ca39', 'ca40', 'ca41', 'ca42', 'ca43', 'ca44']\n",
      "布朗语料库“news”类别中的词汇:\n",
      "['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]\n",
      "布朗语料库“news”类别中的句子:\n",
      "[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', \"Atlanta's\", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', \"''\", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', \"''\", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]\n"
     ]
    }
   ],
   "source": [
    "from nltk.corpus import brown\n",
    "print (\"布朗语料库的类别:\")\n",
    "print(brown.categories())\n",
    "print(\"布朗语料库“news”类别中的文件:\")\n",
    "print(brown.fileids(categories = 'news'))\n",
    "print(\"布朗语料库“news”类别中的词汇:\")\n",
    "print(brown.words(categories = 'news'))\n",
    "print(\"布朗语料库“news”类别中的句子:\")\n",
    "print(brown.sents(categories = 'news'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "路透社语料库中的前5个文件:\n"
     ]
    },
    {
     "ename": "LookupError",
     "evalue": "\n**********************************************************************\n  Resource \u001b[93mreuters\u001b[0m not found.\n  Please use the NLTK Downloader to obtain the resource:\n\n  \u001b[31m>>> import nltk\n  >>> nltk.download('reuters')\n  \u001b[0m\n  Searched in:\n    - 'C:\\\\Users\\\\Administrator/nltk_data'\n    - 'C:\\\\nltk_data'\n    - 'D:\\\\nltk_data'\n    - 'E:\\\\nltk_data'\n    - 'C:\\\\ProgramData\\\\Anaconda3\\\\nltk_data'\n    - 'C:\\\\ProgramData\\\\Anaconda3\\\\share\\\\nltk_data'\n    - 'C:\\\\ProgramData\\\\Anaconda3\\\\lib\\\\nltk_data'\n    - 'C:\\\\Users\\\\Administrator\\\\AppData\\\\Roaming\\\\nltk_data'\n**********************************************************************\n",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mLookupError\u001b[0m                               Traceback (most recent call last)",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\nltk\\corpus\\util.py\u001b[0m in \u001b[0;36m__load\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m     79\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[0mLookupError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 80\u001b[1;33m                 \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mroot\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnltk\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'{}/{}'\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msubdir\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mzip_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     81\u001b[0m                 \u001b[1;32mexcept\u001b[0m \u001b[0mLookupError\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\nltk\\data.py\u001b[0m in \u001b[0;36mfind\u001b[1;34m(resource_name, paths)\u001b[0m\n\u001b[0;32m    674\u001b[0m     \u001b[0mresource_not_found\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'\\n%s\\n%s\\n%s\\n'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0msep\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmsg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msep\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 675\u001b[1;33m     \u001b[1;32mraise\u001b[0m \u001b[0mLookupError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresource_not_found\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    676\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mLookupError\u001b[0m: \n**********************************************************************\n  Resource \u001b[93mreuters\u001b[0m not found.\n  Please use the NLTK Downloader to obtain the resource:\n\n  \u001b[31m>>> import nltk\n  >>> nltk.download('reuters')\n  \u001b[0m\n  Searched in:\n    - 'C:\\\\Users\\\\Administrator/nltk_data'\n    - 'C:\\\\nltk_data'\n    - 'D:\\\\nltk_data'\n    - 'E:\\\\nltk_data'\n    - 'C:\\\\ProgramData\\\\Anaconda3\\\\nltk_data'\n    - 'C:\\\\ProgramData\\\\Anaconda3\\\\share\\\\nltk_data'\n    - 'C:\\\\ProgramData\\\\Anaconda3\\\\lib\\\\nltk_data'\n    - 'C:\\\\Users\\\\Administrator\\\\AppData\\\\Roaming\\\\nltk_data'\n**********************************************************************\n",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[1;31mLookupError\u001b[0m                               Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-3-540a55a67af1>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mnltk\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcorpus\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mreuters\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'路透社语料库中的前5个文件:'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreuters\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfileids\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      4\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'路透社语料库的类别: '\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreuters\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcategories\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\nltk\\corpus\\util.py\u001b[0m in \u001b[0;36m__getattr__\u001b[1;34m(self, attr)\u001b[0m\n\u001b[0;32m    114\u001b[0m             \u001b[1;32mraise\u001b[0m \u001b[0mAttributeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"LazyCorpusLoader object has no attribute '__bases__'\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    115\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 116\u001b[1;33m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__load\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    117\u001b[0m         \u001b[1;31m# This looks circular, but its not, since __load() changes our\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    118\u001b[0m         \u001b[1;31m# __class__ to something new:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\nltk\\corpus\\util.py\u001b[0m in \u001b[0;36m__load\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m     79\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[0mLookupError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     80\u001b[0m                 \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mroot\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnltk\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'{}/{}'\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msubdir\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mzip_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 81\u001b[1;33m                 \u001b[1;32mexcept\u001b[0m \u001b[0mLookupError\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     82\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     83\u001b[0m         \u001b[1;31m# Load the corpus.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\nltk\\corpus\\util.py\u001b[0m in \u001b[0;36m__load\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m     76\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     77\u001b[0m             \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 78\u001b[1;33m                 \u001b[0mroot\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnltk\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'{}/{}'\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msubdir\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     79\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[0mLookupError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     80\u001b[0m                 \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mroot\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnltk\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'{}/{}'\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msubdir\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mzip_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\nltk\\data.py\u001b[0m in \u001b[0;36mfind\u001b[1;34m(resource_name, paths)\u001b[0m\n\u001b[0;32m    673\u001b[0m     \u001b[0msep\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'*'\u001b[0m \u001b[1;33m*\u001b[0m \u001b[1;36m70\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    674\u001b[0m     \u001b[0mresource_not_found\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'\\n%s\\n%s\\n%s\\n'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0msep\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmsg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msep\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 675\u001b[1;33m     \u001b[1;32mraise\u001b[0m \u001b[0mLookupError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresource_not_found\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    676\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    677\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mLookupError\u001b[0m: \n**********************************************************************\n  Resource \u001b[93mreuters\u001b[0m not found.\n  Please use the NLTK Downloader to obtain the resource:\n\n  \u001b[31m>>> import nltk\n  >>> nltk.download('reuters')\n  \u001b[0m\n  Searched in:\n    - 'C:\\\\Users\\\\Administrator/nltk_data'\n    - 'C:\\\\nltk_data'\n    - 'D:\\\\nltk_data'\n    - 'E:\\\\nltk_data'\n    - 'C:\\\\ProgramData\\\\Anaconda3\\\\nltk_data'\n    - 'C:\\\\ProgramData\\\\Anaconda3\\\\share\\\\nltk_data'\n    - 'C:\\\\ProgramData\\\\Anaconda3\\\\lib\\\\nltk_data'\n    - 'C:\\\\Users\\\\Administrator\\\\AppData\\\\Roaming\\\\nltk_data'\n**********************************************************************\n"
     ]
    }
   ],
   "source": [
    "from nltk.corpus import reuters\n",
    "print('路透社语料库中的前5个文件:')\n",
    "print(reuters.fileids()[:5])\n",
    "print('路透社语料库的类别: ')\n",
    "print(reuters.categories())\n",
    "print('文件“test/14828”的所属类别：%s'%(reuters.categories(\"test/14828\")))\n",
    "print('类别“crude”和“cron”对应的文件: ')\n",
    "print(reuters.fileids([\"crude\", \"cron\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'zhconv'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-4-2a4380c84689>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0murlopen\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mzhconv\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mconvert\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      3\u001b[0m \u001b[0murl\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'https://www.gutenberg.org/files/23962/23962-0.txt'\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      4\u001b[0m \u001b[0mhtml\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0murlopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mur1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[0mhtml\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mhtml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'utf-8'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'zhconv'"
     ]
    }
   ],
   "source": [
    "from urllib.request import urlopen\n",
    "from zhconv import convert\n",
    "url = 'https://www.gutenberg.org/files/23962/23962-0.txt'\n",
    "html = urlopen(ur1).read()\n",
    "html = html.decode('utf-8')\n",
    "html = convert(html[600:10000], 'zh-hzns')\n",
    "print(html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "text ='电脑是20世纪人类最伟大的发明之一。按性能的不同，电脑可分为巨型机，大型机，小型机，工作站和微型电脑。'\n",
    "p_string = text.split(\"。\")\n",
    "for line in p_string:\n",
    "    if re.match('电脑', line)is not None:\n",
    "        print(\"句子“\",line, \"”是以“电脑”开头的\")\n",
    "        elif line:\n",
    "            print(\"句子“\",line, \"”不是以“电脑”开头的\")\n",
    "        if re.search('电脑',line) is not None:\n",
    "            line = re.sub('电脑','计算机', line)\n",
    "            print(\"将'电脑'替换为'计算机'后的结果:\", line)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
